# Importing required libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import hvplot.pandas
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
# Load the wide-format indicators CSV and preview its first rows
data = pd.read_csv(filepath_or_buffer="../Resources/Project_Indicators_Wide.csv")
data.head()
| country_code | country | region | status | year | ALC | AMR | BCG | CANP | DTP | ... | HDI | HE | INCI | INFMR | LE | MCV | OBP | POPD | POPG | SR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABW | Aruba | Latin America & Caribbean | NaN | 2000 | NaN | 112.4760 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 73.787 | NaN | NaN | 504.811111 | 2.064841 | 93.3 |
| 1 | ABW | Aruba | Latin America & Caribbean | NaN | 2001 | NaN | 111.9155 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 73.853 | NaN | NaN | 516.066667 | 2.205163 | 93.3 |
| 2 | ABW | Aruba | Latin America & Caribbean | NaN | 2002 | NaN | 111.3550 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 73.937 | NaN | NaN | 527.733333 | 2.235515 | 93.3 |
| 3 | ABW | Aruba | Latin America & Caribbean | NaN | 2003 | NaN | 109.9290 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 74.038 | NaN | NaN | 538.977778 | 2.108324 | 93.3 |
| 4 | ABW | Aruba | Latin America & Caribbean | NaN | 2004 | NaN | 108.5030 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 74.156 | NaN | NaN | 548.577778 | 1.765473 | 93.3 |
5 rows × 23 columns
# Keep only the rows that have a value in every column
df = data.dropna(axis=0, how="any")
df.head()
| country_code | country | region | status | year | ALC | AMR | BCG | CANP | DTP | ... | HDI | HE | INCI | INFMR | LE | MCV | OBP | POPD | POPG | SR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20 | AFG | Afghanistan | South Asia | Developing | 2000 | 0.002667 | 310.8305 | 30.0 | 0.502297 | 48.0 | ... | 0.350 | 9.0 | 0.333 | 90.5 | 55.841 | 27.0 | 2.3 | 31.829117 | 2.975057 | 105.9 |
| 21 | AFG | Afghanistan | South Asia | Developing | 2001 | 0.005333 | 304.8580 | 43.0 | 0.508536 | 59.0 | ... | 0.353 | 9.0 | 0.318 | 87.9 | 56.308 | 37.0 | 2.4 | 33.095904 | 3.902805 | 105.9 |
| 22 | AFG | Afghanistan | South Asia | Developing | 2002 | 0.008000 | 298.8855 | 46.0 | 0.512110 | 62.0 | ... | 0.384 | 9.0 | 0.386 | 85.3 | 56.784 | 35.0 | 2.6 | 34.618102 | 4.496719 | 105.9 |
| 23 | AFG | Afghanistan | South Asia | Developing | 2003 | 0.010667 | 292.0365 | 44.0 | 0.515965 | 66.0 | ... | 0.393 | 9.0 | 0.391 | 82.7 | 57.271 | 39.0 | 2.7 | 36.272510 | 4.668344 | 105.9 |
| 24 | AFG | Afghanistan | South Asia | Developing | 2004 | 0.013333 | 285.1880 | 51.0 | 0.520604 | 72.0 | ... | 0.409 | 10.0 | 0.389 | 80.0 | 57.772 | 48.0 | 2.9 | 37.874413 | 4.321560 | 105.9 |
5 rows × 23 columns
# Inspect the dtype of each column in the cleaned frame
df.dtypes
country_code object country object region object status object year int64 ALC float64 AMR float64 BCG float64 CANP float64 DTP float64 EDI float64 GDP float64 GDPG float64 HDI float64 HE float64 INCI float64 INFMR float64 LE float64 MCV float64 OBP float64 POPD float64 POPG float64 SR float64 dtype: object
# Remove the identifier columns, the year and the status label; region is kept
select_df = df.drop(["country_code", "country", "year", "status"], axis=1)
select_df.head()
| region | ALC | AMR | BCG | CANP | DTP | EDI | GDP | GDPG | HDI | HE | INCI | INFMR | LE | MCV | OBP | POPD | POPG | SR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20 | South Asia | 0.002667 | 310.8305 | 30.0 | 0.502297 | 48.0 | 0.235 | 4.055180e+09 | 3.868380 | 0.350 | 9.0 | 0.333 | 90.5 | 55.841 | 27.0 | 2.3 | 31.829117 | 2.975057 | 105.9 |
| 21 | South Asia | 0.005333 | 304.8580 | 43.0 | 0.508536 | 59.0 | 0.247 | 4.055180e+09 | 3.868380 | 0.353 | 9.0 | 0.318 | 87.9 | 56.308 | 37.0 | 2.4 | 33.095904 | 3.902805 | 105.9 |
| 22 | South Asia | 0.008000 | 298.8855 | 46.0 | 0.512110 | 62.0 | 0.259 | 4.055180e+09 | 3.868380 | 0.384 | 9.0 | 0.386 | 85.3 | 56.784 | 35.0 | 2.6 | 34.618102 | 4.496719 | 105.9 |
| 23 | South Asia | 0.010667 | 292.0365 | 44.0 | 0.515965 | 66.0 | 0.271 | 4.515559e+09 | 3.868380 | 0.393 | 9.0 | 0.391 | 82.7 | 57.271 | 39.0 | 2.7 | 36.272510 | 4.668344 | 105.9 |
| 24 | South Asia | 0.013333 | 285.1880 | 51.0 | 0.520604 | 72.0 | 0.302 | 5.226779e+09 | -2.875203 | 0.409 | 10.0 | 0.389 | 80.0 | 57.772 | 48.0 | 2.9 | 37.874413 | 4.321560 | 105.9 |
# Scatterplots comparing features with life expectancy
# One panel per non-region column (LE on the x-axis), coloured by region, on a 6x3 grid.
# NOTE: only 'region' is excluded, so 'LE' itself is also plotted against 'LE'.
fig, axes = plt.subplots(6, 3, figsize=(15, 15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for x, column in enumerate(select_df.columns.difference(['region'])):
    # Map the running index onto the grid in row-major order
    i, j = divmod(x, 3)
    plot = sns.scatterplot(ax=axes[i, j], x='LE', y=column, data=select_df, hue='region')
    if x == 0:
        # Capture the legend entries once, from the first panel, before its legend is removed
        handles, labels = axes[0, 0].get_legend_handles_labels()
    # Drop the per-panel legend; a single figure-level legend is added below
    axes[i, j].get_legend().remove()
fig.legend(handles, labels, loc = 'upper left',title="Regions")
plt.savefig("../Images/Plots/Scatter_plot.png")
# Density histogram (with KDE) of life expectancy, one step curve per region
fig, ax = plt.subplots(figsize=(15, 15))
sns.histplot(
    data=select_df,
    x='LE',
    hue='region',
    element='step',
    stat='density',
    kde=True,
    ax=ax,
)
ax.set_xlabel("Life Expectancy")
sns.move_legend(ax, 'upper left', title="Regions")
plt.savefig("../Images/Plots/Distribution_plot.png")
# Correlation matrix
# 'region' is a string column; restrict to numeric columns so that corr() does not
# fail on pandas versions that no longer drop non-numeric columns silently.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(select_df.select_dtypes(include="number").corr(), cmap='coolwarm', annot=True, ax=ax)
plt.savefig("../Images/Plots/Correlation_matrix.png")
# Target is life expectancy; features are the remaining indicators minus the excluded columns
y = select_df["LE"]
X = select_df.drop(
    columns=["LE", "region", "ALC", "BCG", "DTP", "GDPG", "POPD", "SR"]
)
# Hold out a test set with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Fit the min-max scaler on the training features only, then apply it to both splits
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
# Train an ordinary linear regression on the scaled training features
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X=X_train_scaled, y=y_train)
LinearRegression()
# Predict life expectancy for the held-out rows
y_pred = lm.predict(X_test_scaled)
# Report R^2 and mean squared error on the test split
from sklearn.metrics import r2_score, mean_squared_error

print(f"The coefficient of determination is {r2_score(y_test,y_pred):.3f}")
print(f"The mean squared error is {mean_squared_error(y_test,y_pred):.3f}")
The coefficient of determination is 0.995 The mean squared error is 0.462
# Compare the model's score on the training and test splits
training_score = lm.score(X_train_scaled, y_train)
testing_score = lm.score(X_test_scaled, y_test)
print(f"Training Score: {training_score:.3f}")
print(f"Testing Score: {testing_score:.3f}")
Training Score: 0.994 Testing Score: 0.995
# Residual plot: (predicted - actual) against the predicted value, with a zero reference line
plt.scatter(x=y_pred, y=y_pred - y_test)
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max())
plt.xlabel("Predicted value")
plt.ylabel("Residuals")
Text(0, 0.5, 'Residuals')
# Report the fitted intercept of the linear model
print(f"The intercept is {lm.intercept_:.3f}")
The intercept is 58.169
# Report the fitted coefficients, paired with their feature names
# (coefficients apply to the min-max-scaled features, in the column order of X)
print("The coefficients are:")
for item in zip(X.columns, lm.coef_):
    print(item)
The coefficients are:
('AMR', -8.195918557413135)
('CANP', 0.385422279500121)
('EDI', -45.67680881174396)
('GDP', -0.1613350050110072)
('HDI', 99.66561814157986)
('HE', 0.6816497256898723)
('INCI', -34.357711285092385)
('INFMR', -5.0999163854464165)
('MCV', 0.13247233027562352)
('OBP', -0.47133514317458614)
('POPG', 0.4122633830283033)
# Target is the development status; features are all remaining numeric indicators
y_s = df["status"]
X_s = df.drop(
    columns=["country", "year", "status", "region", "country_code"]
)
# Random row-wise split with a fixed seed
# NOTE(review): rows of the same country (different years) can end up in both splits - confirm this is intended
X_s_train, X_s_test, y_s_train, y_s_test = train_test_split(
    X_s, y_s, random_state=42
)
# Normalizing the data (min-max scaling; the scaler is fitted on the training split only)
# NOTE: this rebinds X_scaler, replacing the scaler that was fitted on the regression features.
X_scaler=MinMaxScaler().fit(X_s_train)
X_s_train_scaled=X_scaler.transform(X_s_train)
X_s_test_scaled=X_scaler.transform(X_s_test)
# Train a logistic-regression classifier on the scaled training features
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', random_state=42)
lr.fit(X=X_s_train_scaled, y=y_s_train)
LogisticRegression(random_state=42)
# Predict the status of the held-out rows and report the accuracy
predicted_status = lr.predict(X_s_test_scaled)
print(f"The accuracy of the model is : {accuracy_score(y_s_test,predicted_status):.3f}")
The accuracy of the model is : 0.950
# Confusion matrix for the logistic-regression predictions (rows: actual, columns: predicted)
print("The confusion matrix is as follows:")
print(confusion_matrix(y_true=y_s_test, y_pred=predicted_status))
The confusion matrix is as follows: [[ 46 10] [ 18 481]]
# Per-class precision / recall / F1 for the logistic-regression predictions
print("The classification report is as follows:")
print(classification_report(y_true=y_s_test, y_pred=predicted_status))
The classification report is as follows:
precision recall f1-score support
Developed 0.72 0.82 0.77 56
Developing 0.98 0.96 0.97 499
accuracy 0.95 555
macro avg 0.85 0.89 0.87 555
weighted avg 0.95 0.95 0.95 555
# Train a random-forest classifier on the same scaled training features
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X=X_s_train_scaled, y=y_s_train)
RandomForestClassifier(random_state=42)
# Predict the status of the held-out rows with the random forest and report the accuracy
predicted_status_rf = rfc.predict(X_s_test_scaled)
print(f"The accuracy of the model is : {accuracy_score(y_s_test,predicted_status_rf):.3f}")
The accuracy of the model is : 1.000
# Confusion matrix for the random-forest predictions (rows: actual, columns: predicted)
print("The confusion matrix is as follows:")
print(confusion_matrix(y_true=y_s_test, y_pred=predicted_status_rf))
The confusion matrix is as follows: [[ 56 0] [ 0 499]]
# Per-class precision / recall / F1 for the random-forest predictions
print("The classification report is as follows:")
print(classification_report(y_true=y_s_test, y_pred=predicted_status_rf))
The classification report is as follows:
precision recall f1-score support
Developed 1.00 1.00 1.00 56
Developing 1.00 1.00 1.00 499
accuracy 1.00 555
macro avg 1.00 1.00 1.00 555
weighted avg 1.00 1.00 1.00 555
# Rank the random-forest feature importances from largest to smallest and plot them as bars
sort_list = sorted(
    zip(X_s.columns, rfc.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
labels, values = zip(*sort_list)
indexes = np.arange(len(labels))
width = 1
fig, ax = plt.subplots(figsize=(12, 12))
ax.bar(indexes, values, linewidth=1, tick_label=labels, edgecolor="black")
ax.set(title="Feature Importance", xlabel="Feature", ylabel="Importance")
plt.savefig("../Images/Plots/Bar_Plot.png")
# Reindexing dataframe so its index is 0..n-1 and lines up with the encoded frame built below
df = df.reset_index(drop=True)
# Transforming country status into one-hot columns
# The encoder is left at its default (sparse) output and densified with .toarray();
# this avoids the `sparse=` keyword, which newer scikit-learn releases no longer accept.
onc = OneHotEncoder()
encoded_df = pd.DataFrame(onc.fit_transform(df["status"].values.reshape(-1, 1)).toarray())
encoded_df.columns = onc.get_feature_names_out(["status"])
encoded_df.tail()
| status_Developed | status_Developing | |
|---|---|---|
| 2212 | 0.0 | 1.0 |
| 2213 | 0.0 | 1.0 |
| 2214 | 0.0 | 1.0 |
| 2215 | 0.0 | 1.0 |
| 2216 | 0.0 | 1.0 |
# Attach the one-hot status columns to the cleaned frame (index-aligned left join)
cluster_df = df.join(encoded_df, how="left")
cluster_df.head()
| country_code | country | region | status | year | ALC | AMR | BCG | CANP | DTP | ... | INCI | INFMR | LE | MCV | OBP | POPD | POPG | SR | status_Developed | status_Developing | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Afghanistan | South Asia | Developing | 2000 | 0.002667 | 310.8305 | 30.0 | 0.502297 | 48.0 | ... | 0.333 | 90.5 | 55.841 | 27.0 | 2.3 | 31.829117 | 2.975057 | 105.9 | 0.0 | 1.0 |
| 1 | AFG | Afghanistan | South Asia | Developing | 2001 | 0.005333 | 304.8580 | 43.0 | 0.508536 | 59.0 | ... | 0.318 | 87.9 | 56.308 | 37.0 | 2.4 | 33.095904 | 3.902805 | 105.9 | 0.0 | 1.0 |
| 2 | AFG | Afghanistan | South Asia | Developing | 2002 | 0.008000 | 298.8855 | 46.0 | 0.512110 | 62.0 | ... | 0.386 | 85.3 | 56.784 | 35.0 | 2.6 | 34.618102 | 4.496719 | 105.9 | 0.0 | 1.0 |
| 3 | AFG | Afghanistan | South Asia | Developing | 2003 | 0.010667 | 292.0365 | 44.0 | 0.515965 | 66.0 | ... | 0.391 | 82.7 | 57.271 | 39.0 | 2.7 | 36.272510 | 4.668344 | 105.9 | 0.0 | 1.0 |
| 4 | AFG | Afghanistan | South Asia | Developing | 2004 | 0.013333 | 285.1880 | 51.0 | 0.520604 | 72.0 | ... | 0.389 | 80.0 | 57.772 | 48.0 | 2.9 | 37.874413 | 4.321560 | 105.9 | 0.0 | 1.0 |
5 rows × 25 columns
# Keep only numeric columns for clustering: drop identifiers, year and the text labels
cluster_df = cluster_df.drop(
    columns=["country", "year", "status", "country_code", "region"]
)
cluster_df.head()
| ALC | AMR | BCG | CANP | DTP | EDI | GDP | GDPG | HDI | HE | INCI | INFMR | LE | MCV | OBP | POPD | POPG | SR | status_Developed | status_Developing | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.002667 | 310.8305 | 30.0 | 0.502297 | 48.0 | 0.235 | 4.055180e+09 | 3.868380 | 0.350 | 9.0 | 0.333 | 90.5 | 55.841 | 27.0 | 2.3 | 31.829117 | 2.975057 | 105.9 | 0.0 | 1.0 |
| 1 | 0.005333 | 304.8580 | 43.0 | 0.508536 | 59.0 | 0.247 | 4.055180e+09 | 3.868380 | 0.353 | 9.0 | 0.318 | 87.9 | 56.308 | 37.0 | 2.4 | 33.095904 | 3.902805 | 105.9 | 0.0 | 1.0 |
| 2 | 0.008000 | 298.8855 | 46.0 | 0.512110 | 62.0 | 0.259 | 4.055180e+09 | 3.868380 | 0.384 | 9.0 | 0.386 | 85.3 | 56.784 | 35.0 | 2.6 | 34.618102 | 4.496719 | 105.9 | 0.0 | 1.0 |
| 3 | 0.010667 | 292.0365 | 44.0 | 0.515965 | 66.0 | 0.271 | 4.515559e+09 | 3.868380 | 0.393 | 9.0 | 0.391 | 82.7 | 57.271 | 39.0 | 2.7 | 36.272510 | 4.668344 | 105.9 | 0.0 | 1.0 |
| 4 | 0.013333 | 285.1880 | 51.0 | 0.520604 | 72.0 | 0.302 | 5.226779e+09 | -2.875203 | 0.409 | 10.0 | 0.389 | 80.0 | 57.772 | 48.0 | 2.9 | 37.874413 | 4.321560 | 105.9 | 0.0 | 1.0 |
# Confirm that every remaining clustering column is numeric
cluster_df.dtypes
ALC float64 AMR float64 BCG float64 CANP float64 DTP float64 EDI float64 GDP float64 GDPG float64 HDI float64 HE float64 INCI float64 INFMR float64 LE float64 MCV float64 OBP float64 POPD float64 POPG float64 SR float64 status_Developed float64 status_Developing float64 dtype: object
# Min-max scale every clustering column
cluster_scaled = MinMaxScaler().fit_transform(X=cluster_df)
# Applying PCA to reduce dimensions from 20 to 4
# (cluster_df has 20 columns: 18 indicators plus the 2 one-hot status columns)
from sklearn.decomposition import PCA
pca=PCA(n_components=4)
cluster_pca=pca.fit_transform(cluster_scaled)
# Checking the explained variance ratio - Variables will be adjusted to use fewer parameters in next iteration
pca.explained_variance_ratio_
array([0.53456714, 0.20126763, 0.0589989 , 0.05359823])
# Wrap the four principal components in a labelled DataFrame
pca_df = pd.DataFrame(data=cluster_pca, columns=["PC1", "PC2", "PC3", "PC4"])
# Identifying best value for k
from sklearn.cluster import KMeans

inertia = []
k = list(range(1, 11))
# Looping through the defined k options and recording the inertia of each fit
for value in k:
    km = KMeans(n_clusters=value, random_state=42)
    km.fit(pca_df)
    inertia.append(km.inertia_)
# Plot inertia against the number of clusters to look for the elbow
elbow_df = pd.DataFrame(data={"k": k, "inertia": inertia})
elbow_df.plot(x="k", y="inertia", xlabel="Number of clusters", ylabel="Inertia")
C:\Users\DCANOWERKCOMPUTA\anaconda3\envs\mlenv\lib\site-packages\sklearn\cluster\_kmeans.py:1037: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=9. "KMeans is known to have a memory leak on Windows "
<AxesSubplot:xlabel='Number of clusters', ylabel='Inertia'>
# Fit the final k-means model with three clusters on the principal components
kmodel = KMeans(n_clusters=3, random_state=42)
kmodel.fit(X=pca_df)
KMeans(n_clusters=3, random_state=42)
# Predict a cluster for every row, then store the fitted labels as a new "class" column
cluster = kmodel.predict(X=pca_df)
pca_df["class"] = kmodel.labels_
pca_df.head()
| PC1 | PC2 | PC3 | PC4 | class | |
|---|---|---|---|---|---|
| 0 | -1.128039 | 0.714938 | 0.684560 | 0.105317 | 0 |
| 1 | -1.053151 | 0.611978 | 0.480666 | 0.029218 | 0 |
| 2 | -0.988180 | 0.566578 | 0.493590 | 0.035084 | 0 |
| 3 | -0.946992 | 0.532654 | 0.463901 | 0.020235 | 0 |
| 4 | -0.867272 | 0.453745 | 0.333850 | -0.006954 | 0 |
# Attach the principal components and cluster labels to the cleaned frame (index-aligned left join)
combined_df = df.join(pca_df, how="left")
combined_df.head()
| country_code | country | region | status | year | ALC | AMR | BCG | CANP | DTP | ... | MCV | OBP | POPD | POPG | SR | PC1 | PC2 | PC3 | PC4 | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Afghanistan | South Asia | Developing | 2000 | 0.002667 | 310.8305 | 30.0 | 0.502297 | 48.0 | ... | 27.0 | 2.3 | 31.829117 | 2.975057 | 105.9 | -1.128039 | 0.714938 | 0.684560 | 0.105317 | 0 |
| 1 | AFG | Afghanistan | South Asia | Developing | 2001 | 0.005333 | 304.8580 | 43.0 | 0.508536 | 59.0 | ... | 37.0 | 2.4 | 33.095904 | 3.902805 | 105.9 | -1.053151 | 0.611978 | 0.480666 | 0.029218 | 0 |
| 2 | AFG | Afghanistan | South Asia | Developing | 2002 | 0.008000 | 298.8855 | 46.0 | 0.512110 | 62.0 | ... | 35.0 | 2.6 | 34.618102 | 4.496719 | 105.9 | -0.988180 | 0.566578 | 0.493590 | 0.035084 | 0 |
| 3 | AFG | Afghanistan | South Asia | Developing | 2003 | 0.010667 | 292.0365 | 44.0 | 0.515965 | 66.0 | ... | 39.0 | 2.7 | 36.272510 | 4.668344 | 105.9 | -0.946992 | 0.532654 | 0.463901 | 0.020235 | 0 |
| 4 | AFG | Afghanistan | South Asia | Developing | 2004 | 0.013333 | 285.1880 | 51.0 | 0.520604 | 72.0 | ... | 48.0 | 2.9 | 37.874413 | 4.321560 | 105.9 | -0.867272 | 0.453745 | 0.333850 | -0.006954 | 0 |
5 rows × 28 columns
# Interactive scatter of the first two principal components, grouped by cluster
combined_df.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="class",
    hover_cols=["country", "year"],
)
# Number of rows for each (region, cluster) pair
class_df = combined_df.groupby(["region", "class"])[["region"]].count()
print(class_df.to_string())
region
region class
East Asia & Pacific 0 99
1 235
2 25
Europe & Central Asia 0 5
1 261
2 208
Latin America & Caribbean 0 16
1 368
Middle East & North Africa 1 184
South Asia 0 78
1 50
Sub-Saharan Africa 0 599
1 89
# Count how many rows each (country, cluster) pair has, most frequent first
combined_df[["country","class"]].value_counts()
country class
Afghanistan 0 16
Mauritania 0 16
Panama 1 16
Pakistan 0 16
Oman 1 16
..
Kenya 1 2
Azerbaijan 0 2
Cambodia 1 2
Kiribati 0 1
Micronesia (Federated States of) 0 1
Length: 159, dtype: int64